* code to merge pregnancies to infants
global path = "/Users/jhainmueller/Box Sync/JHMedBox"
clear all
set more off


* ---------------------------------------------------------------
* Set up infant data for merge
*   input : allkids.dta      (all claims pulled for claimants aged 0)
*   output: kidsformerge.dta (one row per infant, match variables
*           prefixed with k_, plus SAK_CASE and mergedate)
* ---------------------------------------------------------------
cd "$path/cawem_wkdata/data/pregkidmerge"
use allkids.dta, clear

* SAK_RECIP: claimant ID
* daten: date of claim
* dob: date of birth
* ADR_SVC_ZIP: zip code of service (claim)
* ADR_ZIP_RES: zip code of claimant (enrollment)
* SAK_CASE: household ID
* daysincebirth: days from dob to date of first service
* CDE_RACE: race of claimant
* CDE_ETHNIC: ethnicity of claimant
* multibirth: potential multi birth

* remove leftovers from earlier runs; dropped one at a time because a single
* drop of both names removes nothing when either variable is absent
capture drop tag1
capture drop rowobs
gen  rowobs = _n
sort SAK_RECIP daten rowobs , stable

* identify first day with claims and keep only the claims of that day
egen mindaten = min(daten) , by(SAK_RECIP)
gen  mindocl  = mindaten == daten
keep if mindocl == 1

* replace each matching variable by its within-infant mode across the
* first-day claims (minmode: lowest value wins when there are ties)
foreach x of varlist ADR_SVC_ZIP ADR_ZIP_RES SAK_CASE daysincebirth CDE_RACE CDE_ETHNIC multibirth  {
    qui: egen rc_`x' = mode(`x') if mindocl == 1  , by(SAK_RECIP) minmode
    qui: drop   `x'
    qui: rename rc_`x' `x'
}

* reduce data to one row per infant
sort SAK_RECIP daten rowobs , stable
egen tag1 = tag(SAK_RECIP)
keep if tag1

keep    SAK_RECIP dob ADR_SVC_ZIP ADR_ZIP_RES SAK_CASE daysincebirth CDE_RACE CDE_ETHNIC multibirth
* SAK_CASE keeps its name because it is a join key shared with the pregnancy data
renvars SAK_RECIP dob ADR_SVC_ZIP ADR_ZIP_RES daysincebirth CDE_RACE CDE_ETHNIC multibirth, prefix(k_)

* create mergedate as infant dob
gen  mergedate = k_dob
cd "$path/cawem_wkdata/data/pregkidmerge"
* data to be used for merge (row is one infant)
save kidsformerge.dta, replace


* ---------------------------------------------------------------
* Set up pregnancy data for merge
*   input : women.dta            (all claims from all female claimants aged 12-51)
*   output: allpregsformerge.dta (one row per delivery with an expected
*           live birth, plus mergedate)
* ---------------------------------------------------------------
cd "$path/cawem_wkdata/data/working"
use women.dta, clear

* pregdate: date of delivery (or other end of pregnancy)
* pregday: indicator for day of delivery
* pregtype: type of pregnancy outcome (live birth, etc.)
* pregno: pregnancy counter
* insurance: insurance group

* reduce to day of delivery
keep if pregday == 1

* replace each matching variable by its mode within woman and pregnancy
* (missing counts as a category here; minmode: lowest value wins on ties)
foreach x of varlist ADR_SVC_ZIP ADR_ZIP_RES SAK_CASE CDE_RACE CDE_ETHNIC multibirth insurance  {
    qui: egen rc_`x' = mode(`x') if pregday == 1 , by(SAK_RECIP pregno) minmode missing
    qui: drop   `x'
    qui: rename rc_`x' `x'
}

* reduce to one row per pregnancy
* helper variables are cleared first so that gen and egen cannot fail on leftovers
capture drop tag1
capture drop rowobs
gen  rowobs = _n
sort SAK_RECIP daten rowobs , stable
egen tag1 = tag(SAK_RECIP pregno)
keep if tag1

keep SAK_RECIP pregdate pregtype pregno ADR_SVC_ZIP ADR_ZIP_RES SAK_CASE CDE_RACE CDE_ETHNIC multibirth insurance

* limit to pregnancies where a birth is expected (live birth, including pre-term birth)
keep if inlist(pregtype, "LB", "LBANDSB", "LBORSB", "PTB")
* create mergedate as date of delivery
gen mergedate = pregdate
cd "$path/cawem_wkdata/data/pregkidmerge"
* data to be used for merge (row is one delivery)
save allpregsformerge.dta, replace


* ---------------------------------------------------------------
* Matching of moms and kids
* ---------------------------------------------------------------
cd "$path/cawem_wkdata/data/pregkidmerge"
set more off
* load all infants
use  kidsformerge, clear
* set up dataset of infants that are still available for matching
save kidsleft, replace
* clean up results of earlier runs (erase works on every platform,
* rm is only available on Mac and Unix)
capture erase matched.dta


* ---------------------------------------------------------------
* First set of matches:
* exact on household ID, and infant date of birth equals the date of
* delivery or falls on one of the two following days
* (to accommodate longer deliveries)
* ---------------------------------------------------------------
* NOTE(review): joinby appears to treat missing key values like any other
* value, so records with missing SAK_CASE could pair up - confirm against data

* load pregnancies
use allpregsformerge.dta, clear
renvars SAK_RECIP ADR_SVC_ZIP ADR_ZIP_RES CDE_RACE CDE_ETHNIC pregtype pregdate pregno multibirth insurance, prefix(m_)

* loop over distance between date of delivery and infant date of birth
foreach i of numlist 0 1 2  {

    * shift merge date by the distance; mergedate equals the delivery date
    * at the start of every iteration (see reset at the end of the loop)
    replace   mergedate = mergedate + `i'
    * match on HH ID and mergedate; forms all pairwise combinations
    joinby SAK_CASE mergedate using kidsleft , unmatched(both)

    * process matches
    preserve
    keep if _merge==3
    * mark multi matches: number of other rows sharing HH ID and date
    duplicates tag SAK_CASE mergedate, gen(dupe)
    gen multimatch = dupe
    drop dupe
    gen m_SAK_CASE = SAK_CASE
    gen k_SAK_CASE = SAK_CASE
    drop mergedate
    * code match quality
    gen  match = "HHID and day `i'"

    * add to the matched dataset; the file does not exist yet in the first
    * iteration, any other append problem is allowed to stop the run
    capture confirm file matched.dta
    if _rc == 0 {
        append using matched
    }
    save matched, replace
    restore

    * process infants that remain unmatched
    preserve
    keep if _merge==2
    drop    mergedate
    gen     mergedate = k_dob
    keep    k_* mergedate SAK_CASE
    * put them back into the pool of available matches
    save    kidsleft, replace
    restore

    * process pregnancies that remain unmatched
    keep if _merge==1
    drop _merge
    drop mergedate
    drop k_*
    * reset merge date to date of delivery
    gen  mergedate = m_pregdate
}
* loop ends on data for pregnancies that remain unmatched in storage

* ---------------------------------------------------------------
* Second set of matches (pregnancies left unmatched by the first set):
* exact on service and enrollment zips, ethnicity, and race, and
* infant date of birth equals the date of delivery or falls on one of
* the two following days (to accommodate longer deliveries)
* ---------------------------------------------------------------
* NOTE(review): joinby appears to treat missing key values like any other
* value, so records with missing zips, race or ethnicity could pair up -
* confirm against data

* copy the mom values into the infant-side names so both files share the join keys
gen    k_ADR_SVC_ZIP = m_ADR_SVC_ZIP
gen    k_ADR_ZIP_RES = m_ADR_ZIP_RES

gen    k_CDE_RACE =   m_CDE_RACE
gen    k_CDE_ETHNIC = m_CDE_ETHNIC

* household ID is no longer a join key: keep the mom value under its own name
gen  m_SAK_CASE = SAK_CASE
drop SAK_CASE

* loop over distance between date of delivery and infant date of birth
foreach i of numlist 0 1 2 {

    * shift merge date by the distance; mergedate equals the delivery date
    * at the start of every iteration (see reset at the end of the loop)
    replace   mergedate = mergedate + `i'
    * match on zips, ethnicity, race and mergedate; forms all pairwise combinations
    joinby mergedate k_ADR_SVC_ZIP k_ADR_ZIP_RES k_CDE_RACE k_CDE_ETHNIC using kidsleft , unmatched(both)

    * process matches
    preserve
    keep if _merge==3
    * mark multi matches: number of other rows sharing all join keys
    duplicates tag mergedate k_ADR_SVC_ZIP k_ADR_ZIP_RES k_CDE_RACE k_CDE_ETHNIC, gen(dupe)
    gen  multimatch = dupe
    drop dupe
    * SAK_CASE comes from the infant file at this stage
    gen  k_SAK_CASE = SAK_CASE
    drop mergedate
    gen  match = "Zips, Eth, Race, and day `i'"
    * add to the matched dataset; a missing file is tolerated, any other
    * append problem is allowed to stop the run
    capture confirm file matched.dta
    if _rc == 0 {
        append using matched
    }
    save matched, replace
    restore

    * process infants that remain unmatched
    preserve
    keep if _merge==2
    drop mergedate
    gen  mergedate = k_dob
    keep k_* mergedate SAK_CASE
    * put them back into the pool of available matches
    save kidsleft, replace
    restore

    * process pregnancies that remain unmatched
    keep if _merge==1
    keep m_* k_ADR_SVC_ZIP k_ADR_ZIP_RES k_CDE_RACE k_CDE_ETHNIC
    * reset merge date to date of delivery
    gen  mergedate = m_pregdate
}

* ---------------------------------------------------------------
* Post-processing: the data in memory are the pregnancies that no
* matching step could link to an infant
* ---------------------------------------------------------------

* label the unmatched pregnancies and stack the matched ones on top
gen    match = "not matched"
append using matched
save   matched, replace
tab match, mis

* drop infants who are merged to more than one pregnancy
* (rows without an infant ID are unmatched pregnancies and are kept)
duplicates tag k_SAK_RECIP , gen(kdupe)
keep if !(kdupe>0 & k_SAK_RECIP!="")

* drop duplicate merges (mom or infant) unless qualitative info indicates multi birth
gen     multib = k_multibirth==1 | m_multibirth==1
tab     multimatch multib, mis
drop if multib==0 & multimatch!=. & multimatch>0

save   matched, replace
tab match, mis

* code valid matches: any of the six HH ID or zip/ethnicity/race match types
gen matchgood = inlist(match, "HHID and day 0", "HHID and day 1", "HHID and day 2", ///
        "Zips, Eth, Race, and day 0", "Zips, Eth, Race, and day 1", "Zips, Eth, Race, and day 2")


* match rate: overall
tab matchgood, mis

* match rate: by insurance group
bys m_insurance: tab matchgood, mis

* match rate: by year of delivery
gen yopreg = year(m_pregdate)
tabstat matchgood, by(yopreg )

* match rate: by insurance group and year of delivery
bys m_insurance: tabstat matchgood, by(yopreg )
drop yopreg

* restrict to matched pregnancies and to the crosswalk variables
keep if match!="not matched"
keep m_SAK_RECIP k_SAK_RECIP m_pregno match

* keep only single births: pregnancies linked to exactly one infant
duplicates tag m_SAK_RECIP m_pregno, gen(mult)
drop if mult != 0
drop mult
rename m_pregno pregno

tab match

* deterministic row order for the saved file
cap drop rowobs
gen rowobs = _n
sort m_SAK_RECIP pregno rowobs , stable

* save crosswalk (sample infants with matchgood==1)
save crosswalkpregtokid, replace



